/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is TRECQuery.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Ben He <ben{a.}dcs.gla.ac.uk> (original author)
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
package org.terrier.structures;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.StringTokenizer;
import java.util.Vector;
import org.apache.log4j.Logger;
import org.terrier.applications.TRECQuerying.QuerySource;
import org.terrier.indexing.TRECFullTokenizer;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.TagSet;
/**
 * Reads the queries from TREC topic files and exposes them as a
 * {@link QuerySource}, i.e. an iterator over query strings with access to
 * the identifier of the most recently returned query.
 * <p><b>Properties:</b></p>
 * <ul>
 * <li><tt>trecquery.ignore.desc.narr.name.tokens</tt> - should the tokens DESCRIPTION and NARRATIVE in the desc and narr fields be ignored? Defaults to true.</li>
 * <li><tt>tokeniser</tt> - name of the Tokeniser class to use to tokenise topics. Defaults to EnglishTokeniser.</li>
 * <li><tt>trec.encoding</tt> - use to set the encoding of TREC topic files. Defaults to the system's default encoding.</li>
 * </ul>
 * @author Ben He and Craig Macdonald
 */
public class TRECQuery implements QuerySource {
    /** The logger used for this class */
    protected static final Logger logger = Logger.getLogger(TRECQuery.class);
    /** Value of <tt>trecquery.ignore.desc.narr.name.tokens</tt> - should the tokens DESCRIPTION and NARRATIVE in the desc and narr fields be ignored? Defaults to true. */
    protected static final boolean IGNORE_DESC_NARR_NAME_TOKENS =
        Boolean.parseBoolean(ApplicationSetup.getProperty("trecquery.ignore.desc.narr.name.tokens","true"));
    /** Encoding to be used to open all files. */
    protected static String desiredEncoding = ApplicationSetup.getProperty("trec.encoding", Charset.defaultCharset().name());
    /** Message logged when topic files were given but none of them yielded a query. */
    private static final String NO_TOPICS_PARSED_MESSAGE =
        "Topic files were specified, but non could be parsed correctly to obtain any topics."
        + " Check you have the correct topic files specified, and that TrecQueryTags properties are correct.";
    /** The topic files used in this object */
    protected String[] topicFiles;
    /** The queries in the topic files.*/
    protected String[] queries;
    /** The query identifiers in the topic files.*/
    protected String[] query_ids;
    /** The index of the next query to be returned by {@link #next()}.*/
    protected int index;

    /**
     * Extracts and stores all the queries from several query files.
     * Every file is attempted, even if an earlier one yields nothing.
     * @param queryfilenames String[] the names of the files containing topics.
     * @param vecStringQueries Vector a vector to which the
     * queries are appended as strings.
     * @param vecStringIds Vector a vector to which the query
     * identifiers are appended as strings.
     * @return boolean true if some queries were successfully extracted
     * from at least one file.
     */
    public boolean extractQuery(String[] queryfilenames, Vector<String> vecStringQueries, Vector<String> vecStringIds)
    {
        boolean gotSome = false;
        for (String queryfilename : queryfilenames) {
            if (extractQuery(queryfilename, vecStringQueries, vecStringIds))
                gotSome = true;
        }
        return gotSome;
    }

    /**
     * Extracts and stores all the queries from a query file.
     * Topics without any query text are skipped, as are topics for which
     * no query identifier could be found. An I/O problem is logged, and the
     * queries extracted up to that point are kept.
     * @param queryfilename String the name of a file containing topics.
     * @param vecStringQueries Vector a vector to which the
     * queries are appended as strings.
     * @param vecStringIds Vector a vector to which the query
     * identifiers are appended as strings.
     * @return boolean true if some queries were successfully extracted.
     */
    public boolean extractQuery(String queryfilename, Vector<String> vecStringQueries, Vector<String> vecStringIds)
    {
        boolean gotSome = false;
        try {
            if (! Files.exists(queryfilename) || ! Files.canRead(queryfilename)) {
                logger.error("The topics file " + queryfilename + " does not exist, or it cannot be read.");
                return false;
            }
            final BufferedReader br = Files.openFileReader(queryfilename, desiredEncoding);
            try {
                final TRECFullTokenizer queryTokenizer = new TRECFullTokenizer(
                    new TagSet(TagSet.TREC_QUERY_TAGS),
                    new TagSet(TagSet.EMPTY_TAGS),
                    br);
                queryTokenizer.setIgnoreMissingClosingTags(true);
                while (!queryTokenizer.isEndOfFile()) {
                    String docnoToken = null;
                    final StringBuilder query = new StringBuilder();
                    while (!queryTokenizer.isEndOfDocument()) {
                        final String token = queryTokenizer.nextToken();
                        if (token == null
                            || token.length() == 0
                            || queryTokenizer.inTagToSkip())
                            continue;
                        if (queryTokenizer.inDocnoTag()) {
                            //The tokenizer is constructed from the trimmed version of the contents
                            //of the query number tag, so that the last token extracted from it, is
                            //always the query number, and not an empty string
                            final StringTokenizer docnoTokens =
                                new StringTokenizer(token.trim(), " ");
                            while (docnoTokens.hasMoreTokens())
                                docnoToken = docnoTokens.nextToken().trim();
                        } else if (queryTokenizer.inTagToProcess()) {
                            //When enabled, every occurrence of the field's own name
                            //("description" in desc, "narrative" in narr) is dropped,
                            //not only the leading label.
                            if (IGNORE_DESC_NARR_NAME_TOKENS
                                && isFieldNameToken(queryTokenizer.currentTag(), token))
                                continue;
                            query.append(token);
                            query.append(' ');
                        }
                    }
                    queryTokenizer.nextDocument();
                    if (query.length() == 0)
                        continue;
                    if (docnoToken == null) {
                        //without an identifier the query cannot be addressed, and adding
                        //it would leave the two vectors with mismatched positions
                        logger.warn("Ignoring a topic without a query identifier in the topic file " + queryfilename);
                        continue;
                    }
                    vecStringQueries.add(query.toString().trim());
                    vecStringIds.add(docnoToken);
                    gotSome = true;
                }
            } finally {
                //always release the reader, including when parsing fails part-way
                br.close();
            }
        } catch (IOException ioe) {
            logger.error("Input/Output exception while extracting queries from the topic file named "+queryfilename, ioe);
        }
        return gotSome;
    }

    /**
     * Tests whether a token is the name of the field it occurs in, i.e.
     * "description" within the desc tag, or "narrative" within the narr tag.
     * Comparisons ignore case and do not depend on the default locale.
     * @param tag the name of the tag currently being processed, may be null.
     * @param token the token found within that tag.
     * @return true if the token only names its enclosing field.
     */
    private static boolean isFieldNameToken(String tag, String token)
    {
        if ("DESC".equalsIgnoreCase(tag))
            return "DESCRIPTION".equalsIgnoreCase(token);
        if ("NARR".equalsIgnoreCase(tag))
            return "NARRATIVE".equalsIgnoreCase(token);
        return false;
    }

    /**
     * Converts the given files into their path names, keeping any
     * directory component.
     * @param files the files to convert.
     * @return the path of each file, in the same order.
     */
    private static String[] toPaths(File[] files)
    {
        final String[] paths = new String[files.length];
        for (int i = 0; i < files.length; i++)
            paths[i] = files[i].getPath();
        return paths;
    }

    /**
     * Stores the extracted queries and their identifiers, and rewinds the
     * iteration. Logs an error if no topic file has been recorded.
     * @param vecStringQueries the extracted queries.
     * @param vecStringQueryIDs the identifiers of the extracted queries.
     */
    private void storeQueries(Vector<String> vecStringQueries, Vector<String> vecStringQueryIDs)
    {
        if (this.topicFiles == null)
            logger.error(NO_TOPICS_PARSED_MESSAGE);
        this.queries = vecStringQueries.toArray(new String[0]);
        this.query_ids = vecStringQueryIDs.toArray(new String[0]);
        this.index = 0;
    }

    /**
     * Constructs an instance of TRECQuery,
     * that reads and stores all the queries from
     * the files defined in the trec.topics property.
     * Only the files that yielded at least one query are reported by
     * {@link #getInfo()}. If the topics cannot be loaded, the problem is
     * logged and the instance contains no queries. */
    public TRECQuery() {
        try {
            final String[] files = ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("trec.topics", ""));
            assert files.length > 0;
            final Vector<String> vecStringQueries = new Vector<String>();
            final Vector<String> vecStringQueryIDs = new Vector<String>();
            final Vector<String> vecStringFiles = new Vector<String>();
            for (String file : files) {
                if (this.extractQuery(file, vecStringQueries, vecStringQueryIDs))
                    vecStringFiles.add(file);
            }
            if (files.length > 0 && vecStringFiles.isEmpty())
                logger.error(NO_TOPICS_PARSED_MESSAGE);
            this.topicFiles = vecStringFiles.toArray(new String[0]);
            this.queries = vecStringQueries.toArray(new String[0]);
            this.query_ids = vecStringQueryIDs.toArray(new String[0]);
            this.index = 0;
        } catch (Exception e) {
            logger.error("Problem getting trec.topics property:", e);
            //leave the instance empty but usable, rather than with null arrays
            this.topicFiles = new String[0];
            this.queries = new String[0];
            this.query_ids = new String[0];
            this.index = 0;
        }
    }

    /**
     * Constructs an instance of TRECQuery that
     * reads and stores all the queries from
     * the specified query file.
     * @param queryfile File the file containing the queries.
     */
    public TRECQuery(File queryfile){
        this(queryfile.getPath());
    }

    /**
     * Constructs an instance of TRECQuery that
     * reads and stores all the queries from
     * the specified query files.
     * @param queryfiles File[] the files containing the queries.
     */
    public TRECQuery(File[] queryfiles){
        this(toPaths(queryfiles));
    }

    /**
     * Constructs an instance of TRECQuery that
     * reads and stores all the queries from a
     * file with the specified filename.
     * @param queryfilename String the name of the file containing
     * all the queries.
     */
    public TRECQuery(String queryfilename){
        final Vector<String> vecStringQueries = new Vector<String>();
        final Vector<String> vecStringQueryIDs = new Vector<String>();
        if (this.extractQuery(queryfilename, vecStringQueries, vecStringQueryIDs))
            this.topicFiles = new String[]{queryfilename};
        storeQueries(vecStringQueries, vecStringQueryIDs);
    }

    /**
     * Constructs an instance of TRECQuery that
     * reads and stores all the queries from
     * files with the specified filenames.
     * @param queryfilenames String[] the names of the files containing
     * all the queries.
     */
    public TRECQuery(String[] queryfilenames){
        final Vector<String> vecStringQueries = new Vector<String>();
        final Vector<String> vecStringQueryIDs = new Vector<String>();
        if (this.extractQuery(queryfilenames, vecStringQueries, vecStringQueryIDs))
            this.topicFiles = queryfilenames;
        storeQueries(vecStringQueries, vecStringQueryIDs);
    }

    /**
     * Returns the index of the last obtained query.
     * @return int the index of the last obtained query, or -1 if no
     * query has been obtained since construction or the last reset.
     */
    public int getIndexOfCurrentQuery() {
        return index - 1;
    }

    /**
     * Returns the number of the queries read from the
     * processed topic files.
     * @return int the number of topics contained in the
     * processed topic files.
     */
    public int getNumberOfQueries() {
        return queries.length;
    }

    /** Returns the filenames of the topic files from which the queries were extracted */
    public String[] getInfo()
    {
        return this.topicFiles;
    }

    /**
     * Returns the filenames of the topic files.
     * @return String[] the same value as {@link #getInfo()}.
     * @deprecated use {@link #getInfo()} instead.
     */
    @Deprecated
    public String[] getTopicFilenames() {
        return getInfo();
    }

    /**
     * Return the query for the given query number.
     * @return String the string representing the query, or null
     * if there is no query with the given number.
     * @param queryNo String The number of a query.
     */
    public String getQuery(String queryNo) {
        for (int i = 0; i < query_ids.length; i++) {
            if (query_ids[i].equals(queryNo))
                return queries[i];
        }
        return null;
    }

    /**
     * Test if there are more queries to process.
     * @return boolean true if there are more queries
     * to process, otherwise returns false.
     * @deprecated use {@link #hasNext()} instead.
     */
    @Deprecated
    public boolean hasMoreQueries() {
        return hasNext();
    }

    /**
     * {@inheritDoc}
     */
    public boolean hasNext()
    {
        return index < queries.length;
    }

    /**
     * Returns a query.
     * @return String the next query.
     * @deprecated use {@link #next()} instead.
     */
    @Deprecated
    public String nextQuery() {
        return next();
    }

    /**
     * Returns the next query and advances the iteration, or returns
     * null once all the queries have been returned.
     */
    public String next()
    {
        if (index >= queries.length)
            return null;
        return queries[index++];
    }

    /**
     * Returns the identifier of the query last returned by {@link #next()};
     * before any query has been returned, the identifier of the first query.
     * @throws ArrayIndexOutOfBoundsException if no queries were loaded.
     */
    public String getQueryId() {
        return query_ids[index == 0 ? 0 : index-1];
    }

    /** Returns the query ids
     * @return String array containing the query ids.
     * @since 2.2 */
    public String[] getQueryIds()
    {
        return query_ids;
    }

    /**
     * Returns the queries in an array of strings
     * @return String[] a copy of the array containing the strings that
     * represent the queries.
     */
    public String[] toArray() {
        return queries.clone();
    }

    /** {@inheritDoc} */
    public void reset() {
        this.index = 0;
    }

    /**
     * Not supported: queries cannot be removed.
     * @throws UnsupportedOperationException always.
     */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Prints the identifier and the text of every query found in a topic file.
     * @param args the name of the topic file, as the only argument.
     */
    public static void main(String[] args)
    {
        if (args.length < 1) {
            System.err.println("Usage: " + TRECQuery.class.getName() + " <topicfile>");
            return;
        }
        final TRECQuery source = new TRECQuery(args[0]);
        while (source.hasNext())
        {
            final String query = source.next();
            final String id = source.getQueryId();
            System.out.println(id + ": " + query);
        }
    }
}